#' ---
#' title : " Project "
#' date : 4-12-2022
#' author : Karan, Sahil,Pranava,Vidhi
#'Importing packages
library(readr)
## Warning: package 'readr' was built under R version 4.2.2
library(data.table)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(rpart)
library(e1071)
library(party)
## Warning: package 'party' was built under R version 4.2.2
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.2.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.2
library(Epi)
## Warning: package 'Epi' was built under R version 4.2.2
library(ROCR)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:modeltools':
##
## Predict
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(VIM)
## Warning: package 'VIM' was built under R version 4.2.2
## Loading required package: colorspace
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(caTools)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:strucchange':
##
## boundary
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.2
#' Import Data
data = read_csv("C:/Users/karan/OneDrive/Desktop/IIT/Data Preparation and Analysis/Group Project/bank-full.csv")
## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl (7): age, balance, day, duration, campaign, pdays, previous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2 = data
head(data)
## # A tibble: 6 × 17
## age job marital educa…¹ default balance housing loan contact day month
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 58 manag… married tertia… no 2143 yes no unknown 5 may
## 2 44 techn… single second… no 29 yes no unknown 5 may
## 3 33 entre… married second… no 2 yes yes unknown 5 may
## 4 47 blue-… married unknown no 1506 yes no unknown 5 may
## 5 33 unkno… single unknown no 1 no no unknown 5 may
## 6 35 manag… married tertia… no 231 yes no unknown 5 may
## # … with 6 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## # previous <dbl>, poutcome <chr>, y <chr>, and abbreviated variable name
## # ¹education
str(data)
## spc_tbl_ [45,211 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:45211] 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr [1:45211] "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr [1:45211] "married" "single" "married" "married" ...
## $ education: chr [1:45211] "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr [1:45211] "no" "no" "no" "no" ...
## $ balance : num [1:45211] 2143 29 2 1506 1 ...
## $ housing : chr [1:45211] "yes" "yes" "yes" "yes" ...
## $ loan : chr [1:45211] "no" "no" "yes" "no" ...
## $ contact : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
## $ day : num [1:45211] 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr [1:45211] "may" "may" "may" "may" ...
## $ duration : num [1:45211] 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : num [1:45211] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : num [1:45211] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : num [1:45211] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr [1:45211] "no" "no" "no" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. balance = col_double(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. day = col_double(),
## .. month = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. y = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(data)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
#Checking dimensions of data
dim(data)
## [1] 45211 17
#' Exploratory Data Analysis
# Yes and Output Analysis
data %>%
count(y) %>%
mutate(perc = n / nrow(data)) -> predictorData2
ggplot(predictorData2, aes(x = y, y = perc)) + geom_bar(stat = "identity")

# Job type analysis
data %>%
count(job) %>%
mutate(perc = n / nrow(data)) -> bank3
ggplot(bank3, aes(x = job, y = perc)) + geom_bar(stat = "identity")

# Job type analysis with output field
ggplot(data,
aes(x = job,
fill = y)) +
geom_bar(position = "dodge")

# Marital type analysis
data %>%
count(marital) %>%
mutate(perc = n / nrow(data)) -> bank3
ggplot(bank3, aes(x = marital, y = perc)) + geom_bar(stat = "identity")

# Marital type analysis with output field
ggplot(data,
aes(x = marital,
fill = y)) +
geom_bar(position = "dodge")

# Defaulter by bank field analysis
data %>%
count(default) %>%
mutate(perc = n / nrow(data)) -> bank3
ggplot(bank3, aes(x = default, y = perc)) + geom_bar(stat = "identity")

# Defaulter by bank analysis with output field
ggplot(data,
aes(x = default,
fill = y)) +
geom_bar(position = "dodge")

Month fields analysis shows in which month bank contact more with
customers
Month Note- day of the week is required calculation and its not
useful data as usually in any banking sector performance is calculated
monthly and quarterly or yearly.
data %>%
count(month) %>%
mutate(perc = n / nrow(data),Month = factor(month, levels = c("jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec"))) -> bank3
ggplot(bank3, aes(x = Month, y = perc)) + geom_bar(stat = "identity")

# Month fields analysis shows in which month bank contact more with customers with output fields
bankMutate <- data %>%
mutate(Month = factor(month, levels = c("jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec")))
ggplot(bankMutate,
aes(x = Month,
fill = y)) +
geom_bar(position = "dodge")

# Call duration analysis
ggplot_duration<- ggplotly(ggplot(data, aes(x=as.factor(y), y=duration)) +
geom_boxplot(fill='#A4A4A4', color="black")+
theme_classic())
ggplot_duration
#'' Data Manipulation and Descriptive Analysis
# Unique values in different columns
data%>% distinct(job)
## # A tibble: 12 × 1
## job
## <chr>
## 1 management
## 2 technician
## 3 entrepreneur
## 4 blue-collar
## 5 unknown
## 6 retired
## 7 admin.
## 8 services
## 9 self-employed
## 10 unemployed
## 11 housemaid
## 12 student
data %>% distinct(marital)
## # A tibble: 3 × 1
## marital
## <chr>
## 1 married
## 2 single
## 3 divorced
data %>% distinct(education)
## # A tibble: 4 × 1
## education
## <chr>
## 1 tertiary
## 2 secondary
## 3 unknown
## 4 primary
data %>% distinct(default)
## # A tibble: 2 × 1
## default
## <chr>
## 1 no
## 2 yes
data %>% distinct(loan)
## # A tibble: 2 × 1
## loan
## <chr>
## 1 no
## 2 yes
data %>% distinct(contact)
## # A tibble: 3 × 1
## contact
## <chr>
## 1 unknown
## 2 cellular
## 3 telephone
data %>% distinct(poutcome)
## # A tibble: 4 × 1
## poutcome
## <chr>
## 1 unknown
## 2 failure
## 3 other
## 4 success
data %>% distinct(y)
## # A tibble: 2 × 1
## y
## <chr>
## 1 no
## 2 yes
# Job Column - Spelling error
data$job<-gsub(".", "", data$job, fixed = TRUE)
# Checking missing values in data set
colSums(is.na.data.frame(data))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# No missing values in data
# Substitute unknowns in the data set with NA
data[data == "unknown"] <- NA
#' Data Preparation
# Converting categorical target variable in yes/no form:
data$y<-ifelse(data$y=="yes",1,0)
#' Splitting Data
set.seed(1)
# Use 70% of data set as training set and 30% as test set
sample <- sample.split(data$y, SplitRatio = 0.7)
train <- subset(data, sample == TRUE)
test <- subset(data, sample == FALSE)
train<-train %>% mutate(y=as.factor(y))
test<-test %>% mutate(y=as.factor(y))
# Splitting train into X and Y train
x_train<-as.data.frame(train) %>% select(-y)
y_train<-train$y
# Splitting test into X and Y train
x_test<-as.data.frame(test) %>% select(-y)
y_test<-test$y
#' Imbalanced Data set
# We need to balance our data
table(y_train)
## y_train
## 0 1
## 27945 3702
set.seed(123)
train_downsample <- downSample(x = x_train,y = y_train,yname = "y")
train_downsample<-train_downsample %>% mutate(y=as.factor(y))
#' Imbalanced Data set
# We need to balance our data
table(y_train)
## y_train
## 0 1
## 27945 3702
set.seed(123)
train_downsample <- downSample(x = x_train,y = y_train,yname = "y")
train_downsample<-train_downsample %>% mutate(y=as.factor(y))
#' Logistic Regression
model_full<-glm(y~.,train_downsample, family = "binomial")
summary(model_full)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_downsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6981 -0.4844 0.2424 0.5487 2.1639
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.456e+00 6.724e-01 -3.653 0.000260 ***
## age 4.024e-03 8.902e-03 0.452 0.651238
## jobblue-collar -2.201e-01 2.652e-01 -0.830 0.406624
## jobentrepreneur -5.843e-01 5.021e-01 -1.164 0.244518
## jobhousemaid -6.426e-01 5.536e-01 -1.161 0.245738
## jobmanagement -1.596e-01 2.761e-01 -0.578 0.563272
## jobretired -1.684e-01 3.785e-01 -0.445 0.656442
## jobself-employed -1.630e-01 4.157e-01 -0.392 0.694892
## jobservices 4.076e-02 3.258e-01 0.125 0.900445
## jobstudent 3.603e-01 4.325e-01 0.833 0.404749
## jobtechnician 4.431e-02 2.477e-01 0.179 0.858010
## jobunemployed 8.166e-02 4.446e-01 0.184 0.854267
## maritalmarried 2.262e-01 2.228e-01 1.015 0.309973
## maritalsingle 2.094e-01 2.622e-01 0.799 0.424577
## educationsecondary 3.454e-01 2.534e-01 1.363 0.172823
## educationtertiary 9.190e-01 3.014e-01 3.049 0.002299 **
## defaultyes -9.174e-01 1.052e+00 -0.872 0.383352
## balance 8.469e-06 2.342e-05 0.362 0.717678
## housingyes -1.092e+00 1.647e-01 -6.631 3.33e-11 ***
## loanyes -2.390e-01 2.389e-01 -1.000 0.317081
## contacttelephone -3.651e-01 2.826e-01 -1.292 0.196417
## day 1.480e-02 9.826e-03 1.506 0.131949
## monthaug 1.307e+00 3.224e-01 4.053 5.05e-05 ***
## monthdec 3.021e-01 5.177e-01 0.584 0.559528
## monthfeb 3.399e-01 3.038e-01 1.119 0.263174
## monthjan -4.140e-01 3.729e-01 -1.110 0.266952
## monthjul 1.707e+00 4.675e-01 3.652 0.000260 ***
## monthjun 1.783e+00 4.197e-01 4.248 2.16e-05 ***
## monthmar 1.998e+00 5.096e-01 3.920 8.85e-05 ***
## monthmay -1.317e-01 2.539e-01 -0.519 0.603968
## monthnov 2.330e-01 2.882e-01 0.809 0.418800
## monthoct 1.023e+00 3.435e-01 2.977 0.002911 **
## monthsep 1.242e+00 3.617e-01 3.434 0.000594 ***
## duration 6.145e-03 4.399e-04 13.970 < 2e-16 ***
## campaign -1.788e-01 5.664e-02 -3.156 0.001600 **
## pdays 7.503e-04 6.122e-04 1.226 0.220374
## previous 2.260e-02 2.056e-02 1.099 0.271773
## poutcomeother 2.558e-01 1.822e-01 1.404 0.160314
## poutcomesuccess 2.051e+00 1.849e-01 11.095 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2278.2 on 1821 degrees of freedom
## Residual deviance: 1331.7 on 1783 degrees of freedom
## (5582 observations deleted due to missingness)
## AIC: 1409.7
##
## Number of Fisher Scoring iterations: 6
# Important coefficients in Model
varImp(model_full, scale = FALSE)
## Overall
## age 0.4520434
## jobblue-collar 0.8298500
## jobentrepreneur 1.1637675
## jobhousemaid 1.1607643
## jobmanagement 0.5779882
## jobretired 0.4448314
## jobself-employed 0.3922245
## jobservices 0.1250987
## jobstudent 0.8331692
## jobtechnician 0.1789080
## jobunemployed 0.1836766
## maritalmarried 1.0152796
## maritalsingle 0.7985060
## educationsecondary 1.3631889
## educationtertiary 3.0486142
## defaultyes 0.8717375
## balance 0.3615638
## housingyes 6.6313529
## loanyes 1.0004740
## contacttelephone 1.2918284
## day 1.5064608
## monthaug 4.0531281
## monthdec 0.5835426
## monthfeb 1.1189196
## monthjan 1.1101095
## monthjul 3.6518209
## monthjun 4.2477855
## monthmar 3.9200684
## monthmay 0.5187031
## monthnov 0.8085047
## monthoct 2.9769592
## monthsep 3.4344076
## duration 13.9696739
## campaign 3.1559215
## pdays 1.2255330
## previous 1.0989886
## poutcomeother 1.4040177
## poutcomesuccess 11.0951074
# Predictions
predicted_log<-predict(model_full,x_test,type="response")
pred_log_test <- ifelse(predicted_log > 0.5, 1, 0) %>% as.factor()
cm_log<-confusionMatrix(pred_log_test,y_test,positive = "1")
cm_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1339 53
## 1 487 490
##
## Accuracy : 0.7721
## 95% CI : (0.7546, 0.7888)
## No Information Rate : 0.7708
## P-Value [Acc > NIR] : 0.4531
##
## Kappa : 0.4963
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9024
## Specificity : 0.7333
## Pos Pred Value : 0.5015
## Neg Pred Value : 0.9619
## Prevalence : 0.2292
## Detection Rate : 0.2068
## Detection Prevalence : 0.4124
## Balanced Accuracy : 0.8178
##
## 'Positive' Class : 1
##
#' Naive Bayes Classifier
model_naive <- naiveBayes(formula = y ~ ., data = train_downsample,laplace=1)
summary(model_naive)
## Length Class Mode
## apriori 2 table numeric
## tables 16 -none- list
## levels 2 -none- character
## isnumeric 16 -none- logical
## call 4 -none- call
# Predictions
naive_pred = predict(model_naive,x_test,type="class" )
naive_log = confusionMatrix(table(naive_pred,y_test))
naive_log
## Confusion Matrix and Statistics
##
## y_test
## naive_pred 0 1
## 0 10084 415
## 1 1893 1172
##
## Accuracy : 0.8298
## 95% CI : (0.8234, 0.8361)
## No Information Rate : 0.883
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4134
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8419
## Specificity : 0.7385
## Pos Pred Value : 0.9605
## Neg Pred Value : 0.3824
## Prevalence : 0.8830
## Detection Rate : 0.7434
## Detection Prevalence : 0.7740
## Balanced Accuracy : 0.7902
##
## 'Positive' Class : 0
##